{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "KG_COVID_19_Data_File_Generator.ipynb",
"provenance": [],
"machine_shape": "hm"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "OKBsas4jb_5D"
},
"source": [
"## Download the dataset and split it into files that can be loaded into memory efficiently"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "t-_csND5cC9s",
"outputId": "8906f326-dff8-44aa-c03e-f8dfcb856631"
},
"source": [
"# -N (timestamping) refreshes kg-covid-19.tar.gz in place when the server copy is newer. Without it a\r\n",
"# re-run saves the download as kg-covid-19.tar.gz.1 and tar below keeps extracting the stale archive.\r\n",
"!wget -N https://kg-hub.berkeleybop.io/kg-covid-19/current/kg-covid-19.tar.gz\r\n",
"!tar -xf kg-covid-19.tar.gz\r\n",
"!ls"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-12-10 02:18:27-- https://kg-hub.berkeleybop.io/kg-covid-19/current/kg-covid-19.tar.gz\n",
"Resolving kg-hub.berkeleybop.io (kg-hub.berkeleybop.io)... 54.192.86.123, 54.192.86.55, 54.192.86.74, ...\n",
"Connecting to kg-hub.berkeleybop.io (kg-hub.berkeleybop.io)|54.192.86.123|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 376275636 (359M) [application/gzip]\n",
"Saving to: ‘kg-covid-19.tar.gz.1’\n",
"\n",
"kg-covid-19.tar.gz. 100%[===================>] 358.84M 171MB/s in 2.1s \n",
"\n",
"2020-12-10 02:18:29 (171 MB/s) - ‘kg-covid-19.tar.gz.1’ saved [376275636/376275636]\n",
"\n",
"kg-covid-19.tar.gz merged-kg_edges.tsv sample_data\n",
"kg-covid-19.tar.gz.1 merged-kg_nodes.tsv\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ExYp9rPycXMf"
},
"source": [
"# Split the edge file into two parts that are parsed separately by the loading cell.\r\n",
"# NOTE(review): head stops at line 10,000,000 and `tail -n +10000000` starts at that same line, so the\r\n",
"# boundary line is written to both files - confirm the reader of the tail file accounts for it.\r\n",
"!head -10000000 merged-kg_edges.tsv > merged-kg_edges_head.tsv\r\n",
"!tail -n +10000000 merged-kg_edges.tsv > merged-kg_edges_tail.tsv"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "wbNuhE4jcdOO"
},
"source": [
"## Import packages and set edge columns"
]
},
{
"cell_type": "code",
"metadata": {
"id": "V3X8ZlDscYRg"
},
"source": [
"import json\r\n",
"\r\n",
"import pandas as pd\r\n",
"\r\n",
"pd.set_option('display.max_rows', 50)\r\n",
"\r\n",
"# Names applied, in positional order, to the frame read from the tail half of the edge file.\r\n",
"columns = [\r\n",
"    'id', 'subject', 'edge_label', 'object', 'relation', 'provided_by', 'target_type',\r\n",
"    'standard_units', 'subjectActivity', 'uo_units', 'assay_organism', 'neighborhood', 'assay',\r\n",
"    'textmining', 'type', 'ECO_code', 'coexpression', 'cooccurence', 'edge_key',\r\n",
"    'coexpression_transferred', 'experiments_transferred', 'standard_value', 'target_organism',\r\n",
"    'publication', 'comment', 'Annotation_Properties', 'evidence', 'textmining_transferred',\r\n",
"    'neighborhood_transferred', 'fusion', 'database_transferred', 'database', 'subj_exp_role',\r\n",
"    'Assigned_by', 'num_participants', 'standard_relation', 'DB_References', 'combined_score',\r\n",
"    'obj_exp_role', 'detection_method', 'experiments', 'objectActivity', 'standard_type', 'With',\r\n",
"    'Date', 'homology', 'association_type', 'publications', 'target_pref_name',\r\n",
"]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hW2ljFZrcjXW"
},
"source": [
"## Load Nodes and Edges into dataframes"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SyEtnVQocivq",
"outputId": "66004cd5-c88d-4a69-f24a-f6f32ab56e21"
},
"source": [
"# Node columns read as pandas' nullable string dtype. 'in_taxon' and 'assay_type' are separate entries:\r\n",
"# a single key containing both names matches no column and leaves both untyped.\r\n",
"node_string_columns = [\r\n",
"    'id', 'name', 'category', 'provided_by', 'TTD_ID', 'iri', 'doi', 'title', 'same_as',\r\n",
"    'tissue_chembl_id', 'TDL', 'description', 'in_taxon', 'assay_type', 'bao_format',\r\n",
"    'confidence_score', 'molecular_formula', 'synonym', 'assay_chembl_id', 'canonical_smiles',\r\n",
"    'pubmed_id', 'ncbi_taxid', 'bao_label', 'xrefs', 'full_name', 'subsets', 'molecule_type',\r\n",
"]\r\n",
"node_float_columns = ['inorganic_flag', 'polymer_flag', 'natural_product']\r\n",
"node_dtypes = {name: 'string' for name in node_string_columns}\r\n",
"node_dtypes.update({name: float for name in node_float_columns})\r\n",
"\r\n",
"# Columns that the rest of the notebook never reads.\r\n",
"unused_node_columns = [\r\n",
"    'doi', 'title', 'tissue_chembl_id', 'TDL', 'in_taxon', 'assay_type', 'bao_format',\r\n",
"    'confidence_score', 'assay_chembl_id', 'pubmed_id', 'bao_label',\r\n",
"]\r\n",
"unused_edge_columns = [\r\n",
"    'id', 'neighborhood', 'textmining', 'coexpression', 'cooccurence', 'edge_key',\r\n",
"    'coexpression_transferred', 'experiments_transferred', 'standard_value',\r\n",
"    'textmining_transferred', 'neighborhood_transferred', 'fusion', 'database_transferred',\r\n",
"    'database', 'num_participants', 'standard_relation', 'combined_score', 'experiments',\r\n",
"    'With', 'Date', 'publications',\r\n",
"]\r\n",
"\r\n",
"kg_nodes_df = pd.read_csv('merged-kg_nodes.tsv', sep='\\t', header=0, dtype=node_dtypes)\r\n",
"kg_nodes_df = kg_nodes_df[kg_nodes_df['name'].notna()]\r\n",
"# drop() returns a new frame; the result must be assigned or the columns silently stay.\r\n",
"kg_nodes_df = kg_nodes_df.drop(unused_node_columns, axis=1)\r\n",
"kg_nodes_df['name'] = kg_nodes_df['name'].str.lower()\r\n",
"# The node type is the prefix in front of the first ':' of the id.\r\n",
"kg_nodes_df['node_type'] = kg_nodes_df.id.str.split(':').str[0]\r\n",
"\r\n",
"kg_edges_df = pd.read_csv('merged-kg_edges_head.tsv', sep='\\t', header=0)\r\n",
"kg_edges_df = kg_edges_df.drop(unused_edge_columns, axis=1)\r\n",
"# read_csv consumes the first line of the tail file as a header row; the real names are assigned next.\r\n",
"kg_edges_tail_df = pd.read_csv('merged-kg_edges_tail.tsv', sep='\\t')\r\n",
"kg_edges_tail_df.columns = columns\r\n",
"kg_edges_tail_df = kg_edges_tail_df.drop(unused_edge_columns, axis=1)\r\n",
"# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index avoids duplicate row labels.\r\n",
"kg_edges_df = pd.concat([kg_edges_df, kg_edges_tail_df], ignore_index=True)\r\n",
"kg_edges_tail_df = None  # release the reference to the tail frame\r\n",
"\r\n",
"print(len(kg_nodes_df), len(kg_edges_df))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (12,13) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n",
"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (0,8,23,26,32,38,39,41,46) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n",
"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (0,6,7,8,9,10,12,14,15,18,22,23,24,25,26,32,33,35,36,38,39,41,42,43,46,47,48) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"307818 21443188\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 425
},
"id": "A0eV37bKcuIB",
"outputId": "b581b13e-14d8-4360-cea8-b12d09cabe5f"
},
"source": [
"kg_nodes_df.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" name | \n",
" category | \n",
" provided_by | \n",
" TTD_ID | \n",
" iri | \n",
" doi | \n",
" title | \n",
" same_as | \n",
" tissue_chembl_id | \n",
" TDL | \n",
" description | \n",
" in_taxon | \n",
" assay_type | \n",
" bao_format | \n",
" confidence_score | \n",
" molecular_formula | \n",
" synonym | \n",
" assay_chembl_id | \n",
" canonical_smiles | \n",
" pubmed_id | \n",
" ncbi_taxid | \n",
" bao_label | \n",
" xrefs | \n",
" inorganic_flag | \n",
" full_name | \n",
" subsets | \n",
" molecule_type | \n",
" polymer_flag | \n",
" natural_product | \n",
" node_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ENSEMBL:ENSG00000004059 | \n",
" arf5 | \n",
" biolink:Gene | \n",
" STRING | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" ADP ribosylation factor 5 | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NCBIGene:381 | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" ENSEMBL | \n",
"
\n",
" \n",
" 1 | \n",
" ENSEMBL:ENSG00000143933 | \n",
" calm2 | \n",
" biolink:Gene | \n",
" STRING | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" calmodulin 2 | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NCBIGene:805 | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" ENSEMBL | \n",
"
\n",
" \n",
" 2 | \n",
" ENSEMBL:ENSG00000131089 | \n",
" arhgef9 | \n",
" biolink:Gene | \n",
" STRING | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" Cdc42 guanine nucleotide exchange factor 9 | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NCBIGene:23229 | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" ENSEMBL | \n",
"
\n",
" \n",
" 3 | \n",
" ENSEMBL:ENSG00000178607 | \n",
" ern1 | \n",
" biolink:Gene | \n",
" STRING | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" endoplasmic reticulum to nucleus signaling 1 | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NCBIGene:2081 | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" ENSEMBL | \n",
"
\n",
" \n",
" 4 | \n",
" ENSEMBL:ENSG00000147889 | \n",
" cdkn2a | \n",
" biolink:Gene | \n",
" STRING | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" cyclin dependent kinase inhibitor 2A | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NCBIGene:1029 | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" ENSEMBL | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id name ... natural_product node_type\n",
"0 ENSEMBL:ENSG00000004059 arf5 ... NaN ENSEMBL\n",
"1 ENSEMBL:ENSG00000143933 calm2 ... NaN ENSEMBL\n",
"2 ENSEMBL:ENSG00000131089 arhgef9 ... NaN ENSEMBL\n",
"3 ENSEMBL:ENSG00000178607 ern1 ... NaN ENSEMBL\n",
"4 ENSEMBL:ENSG00000147889 cdkn2a ... NaN ENSEMBL\n",
"\n",
"[5 rows x 31 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 218
},
"id": "xED6Or58cxis",
"outputId": "ed42160e-53ee-48e1-e269-f3c9491ac6e0"
},
"source": [
"kg_edges_df.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject | \n",
" edge_label | \n",
" object | \n",
" relation | \n",
" provided_by | \n",
" target_type | \n",
" standard_units | \n",
" subjectActivity | \n",
" uo_units | \n",
" assay_organism | \n",
" assay | \n",
" type | \n",
" ECO_code | \n",
" target_organism | \n",
" publication | \n",
" comment | \n",
" Annotation_Properties | \n",
" evidence | \n",
" subj_exp_role | \n",
" Assigned_by | \n",
" DB_References | \n",
" obj_exp_role | \n",
" detection_method | \n",
" objectActivity | \n",
" standard_type | \n",
" homology | \n",
" association_type | \n",
" target_pref_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ENSEMBL:ENSG00000004059 | \n",
" biolink:has_gene_product | \n",
" UniProtKB:P84085 | \n",
" RO:0002205 | \n",
" NCBI | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" ENSEMBL:ENSG00000143933 | \n",
" biolink:has_gene_product | \n",
" UniProtKB:P0DP24 | \n",
" RO:0002205 | \n",
" NCBI | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" ENSEMBL:ENSG00000131089 | \n",
" biolink:has_gene_product | \n",
" UniProtKB:O43307 | \n",
" RO:0002205 | \n",
" NCBI | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" ENSEMBL:ENSG00000178607 | \n",
" biolink:has_gene_product | \n",
" UniProtKB:O75460 | \n",
" RO:0002205 | \n",
" NCBI | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" ENSEMBL:ENSG00000147889 | \n",
" biolink:has_gene_product | \n",
" UniProtKB:P42771 | \n",
" RO:0002205 | \n",
" NCBI | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" subject ... target_pref_name\n",
"0 ENSEMBL:ENSG00000004059 ... NaN\n",
"1 ENSEMBL:ENSG00000143933 ... NaN\n",
"2 ENSEMBL:ENSG00000131089 ... NaN\n",
"3 ENSEMBL:ENSG00000178607 ... NaN\n",
"4 ENSEMBL:ENSG00000147889 ... NaN\n",
"\n",
"[5 rows x 28 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gA5yfYl1c3jJ"
},
"source": [
"## Some Helper Functions to generate the data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yBauLFAfc1CN"
},
"source": [
"def next_level_relations(edge_node_list, drug_nodes, drug_edges):\r\n",
"    \"\"\"Return the nodes listed in `edge_node_list` and every edge touching one of them.\r\n",
"\r\n",
"    Args:\r\n",
"        edge_node_list: Node ids to look up in kg_nodes_df and kg_edges_df.\r\n",
"        drug_nodes: Ignored; kept so existing calls keep working.\r\n",
"        drug_edges: Ignored; kept so existing calls keep working.\r\n",
"\r\n",
"    Returns:\r\n",
"        Tuple (nodes, edges): the rows of kg_nodes_df whose id is in the list, and the rows of\r\n",
"        kg_edges_df whose subject or object is in the list.\r\n",
"    \"\"\"\r\n",
"    drug_nodes = kg_nodes_df[kg_nodes_df.id.isin(edge_node_list)]\r\n",
"\r\n",
"    # A single OR-ed mask keeps each edge once. Concatenating the subject matches and the object\r\n",
"    # matches would repeat every edge whose two endpoints are both in the list.\r\n",
"    touches_list = kg_edges_df['subject'].isin(edge_node_list) | kg_edges_df['object'].isin(edge_node_list)\r\n",
"    drug_edges = kg_edges_df[touches_list]\r\n",
"\r\n",
"    return drug_nodes, drug_edges\r\n",
"\r\n",
"\r\n",
"def create_data_json(term, depth=0, generate_flag=False):\r\n",
"    \"\"\"Build a nodes/links graph around the nodes whose name contains `term` and optionally save it.\r\n",
"\r\n",
"    Args:\r\n",
"        term: Text searched for in the lower-cased node names. It is lower-cased as well and matched\r\n",
"            literally, so regex metacharacters in it have no special meaning.\r\n",
"        depth: Number of times the edge set is widened through next_level_relations().\r\n",
"        generate_flag: Write the JSON file even when the graph has 100 links or fewer.\r\n",
"\r\n",
"    Returns:\r\n",
"        Tuple (number of nodes, number of links) of the resulting graph. The graph is written to\r\n",
"        term + '.json' when generate_flag is set or when it has more than 100 links.\r\n",
"    \"\"\"\r\n",
"    seed_nodes = kg_nodes_df[kg_nodes_df.name.str.contains(term.lower(), regex=False)]\r\n",
"    seed_ids = list(seed_nodes.id)\r\n",
"\r\n",
"    # A single OR-ed mask keeps each edge once, even when both of its endpoints match.\r\n",
"    touches_seed = kg_edges_df['subject'].isin(seed_ids) | kg_edges_df['object'].isin(seed_ids)\r\n",
"    drug_nodes = seed_nodes\r\n",
"    drug_edges = kg_edges_df[touches_seed]\r\n",
"\r\n",
"    for _ in range(depth):\r\n",
"        frontier_ids = list(set(drug_edges.subject) | set(drug_edges.object))\r\n",
"        drug_nodes, drug_edges = next_level_relations(frontier_ids, drug_nodes, drug_edges)\r\n",
"\r\n",
"    # The exported nodes are the endpoints of the collected edges that exist in the node table.\r\n",
"    endpoint_ids = list(set(drug_edges.subject) | set(drug_edges.object))\r\n",
"    drug_nodes = kg_nodes_df[kg_nodes_df.id.isin(endpoint_ids)]\r\n",
"\r\n",
"    links_df = drug_edges[\r\n",
"        ['subject', 'edge_label', 'object', 'relation', 'provided_by', 'association_type', 'publication', 'ECO_code']\r\n",
"    ].rename(columns={'subject': 'source', 'edge_label': 'value', 'object': 'target'})\r\n",
"    nodes_df = drug_nodes[['id', 'name', 'category', 'provided_by']].rename(columns={'category': 'group'})\r\n",
"\r\n",
"    data = {\r\n",
"        'nodes': json.loads(nodes_df.to_json(orient='records')),\r\n",
"        'links': json.loads(links_df.to_json(orient='records')),\r\n",
"    }\r\n",
"\r\n",
"    # Keep only links whose two endpoints were both exported; a set makes each lookup O(1).\r\n",
"    known_ids = {node['id'] for node in data['nodes']}\r\n",
"    data['links'] = [\r\n",
"        link for link in data['links']\r\n",
"        if link['source'] in known_ids and link['target'] in known_ids\r\n",
"    ]\r\n",
"\r\n",
"    if generate_flag or len(data['links']) > 100:\r\n",
"        with open(term + '.json', 'w') as outfile:\r\n",
"            json.dump(data, outfile, indent=4)\r\n",
"    return len(data['nodes']), len(data['links'])\r\n",
"\r\n",
"def generate_bubble_chart_data(data_type, column):\r\n",
"    \"\"\"Total how often each '|'-separated label occurs in `column` and save the totals as CSV.\r\n",
"\r\n",
"    Args:\r\n",
"        data_type: 'nodes' reads from kg_nodes_df; any other value reads from kg_edges_df.\r\n",
"        column: Column whose values are split on '|' and counted.\r\n",
"\r\n",
"    The result is written to '%s_%s.csv' % (data_type, column) with the columns `value`\r\n",
"    (total count) and `id` (label), one row per label in sorted label order.\r\n",
"    \"\"\"\r\n",
"    source_df = kg_nodes_df if data_type == 'nodes' else kg_edges_df\r\n",
"\r\n",
"    # Count every distinct raw value once, then split multi-label values so that each label\r\n",
"    # receives the full count of the raw value it appeared in.\r\n",
"    per_value = source_df[column].value_counts().rename_axis(column).reset_index(name='counts')\r\n",
"    per_value[column] = per_value[column].str.split('|')\r\n",
"    totals = per_value.explode(column).groupby(column)['counts'].sum()\r\n",
"\r\n",
"    data_df = totals.rename('value').rename_axis('id').reset_index()[['value', 'id']]\r\n",
"    data_df.to_csv('%s_%s.csv' % (data_type, column))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "WBEOKE9idBPL"
},
"source": [
"## Samples"
]
},
{
"cell_type": "code",
"metadata": {
"id": "NuodfirZdAfA"
},
"source": [
"create_data_json(\"ubql1_human\", generate_flag=True)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "EuI0-BuIdGnO"
},
"source": [
"generate_bubble_chart_data(\"nodes\",\"node_type\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4dQobM9fdXBD",
"outputId": "7b70615a-f58d-407f-eca6-7aab296329e1"
},
"source": [
"!ls -lh"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"total 8.3G\n",
"-rw-r--r-- 1 root root 359M Dec 3 22:19 kg-covid-19.tar.gz\n",
"-rw-r--r-- 1 root root 359M Dec 3 22:19 kg-covid-19.tar.gz.1\n",
"-rw-r--r-- 1 root root 2.0G Dec 10 02:19 merged-kg_edges_head.tsv\n",
"-rw-r--r-- 1 root root 1.8G Dec 10 02:19 merged-kg_edges_tail.tsv\n",
"-rw-r--r-- 1 114 120 3.8G Dec 3 03:22 merged-kg_edges.tsv\n",
"-rw-r--r-- 1 114 120 83M Dec 3 03:01 merged-kg_nodes.tsv\n",
"drwxr-xr-x 1 root root 4.0K Dec 2 22:04 sample_data\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VGEXAf-Yduei",
"outputId": "a543c3fd-f791-45ae-a166-afaaf900c76a"
},
"source": [
"# Distinct node ids referenced by the edges versus the number of node rows that were kept.\r\n",
"print(len(set(kg_edges_df.subject).union(kg_edges_df.object)), len(kg_nodes_df.id))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"372799 307818\n"
],
"name": "stdout"
}
]
}
]
}